



# Bandwidth for the sample restriction: further down, rows are kept only when
# abs(running_pre) is strictly below this value.
bdw <- 3500


#################################################
##   Load the data                          #####
#################################################

# wd_data must already be defined when this script runs.
setwd(wd_data)
data <- read.csv('main_dataset.csv')

# Drop rows where sruT is missing.
data <- data[!is.na(data$sruT), ]

# Running variable built from hlm_pre_p (rounded to 2 decimals): the value
# subtracted is .20 when policy_period <= 3 and .25 otherwise.
data$running_hlm <- ifelse(data$policy_period > 3,
                           round(data$hlm_pre_p, 2) - .25,
                           round(data$hlm_pre_p, 2) - .20)

# Periods / years elapsed relative to sru_period / sru_year; set to 0 when the
# reference value is missing.
data$policy_period_ind <- ifelse(!is.na(data$sru_period),
                                 data$policy_period - data$sru_period,
                                 0)

data$policy_year_ind <- ifelse(!is.na(data$sru_year),
                               data$year - data$sru_year,
                               0)

# 0/1 indicators: running above zero, running_hlm below zero.
data$dummy <- ifelse(data$running > 0, 1, 0)
data$dummy_hlm <- ifelse(data$running_hlm < 0, 1, 0)

# Shares computed from the raw counts.
data$res_own_share <- data$num_res_own / data$num_res
data$hlm_share_res <- data$num_hlm / data$num_res
data$hlm_share_log <- data$num_hlm / data$num_log

#################################################
##   Subset the sample For Diff in Diff     #####
#################################################

# Variables treated as constant over time (e.g. region), read off the 2006 rows.
df2 <- data[
  data$year %in% 2006,
  c('CODGEO', 'reg', 'dep', 'agglo_name', 'sruT', 'running',
    'running_hlm', 'exemp', 'dummy', 'agglo_sru', 'border_agglo')
]


# Pre-treatment variables, taken from the 1999 rows (imm_share_90 is carried
# over as stored in the data).
df1 <- data[which(data$year == 1999),
            c('CODGEO', 'libcom', 'sru_period', 'imm_share_90', 'hlm_share',
              'res_own_share', 'running_pre', 'imm', 'pop', 'fn_1995')]

# Immigrant share in the 1999 rows, from the imm and pop columns.
df1$imm_share_99 <- df1$imm / df1$pop

# Rescale fn_1995 by 100 (presumably percent -> proportion; confirm).
df1$fn_1995 <- df1$fn_1995 / 100

# Rename by NAME, not by position. imm_share_99 is appended after fn_1995, so
# assigning a positional name vector that lists imm_share_99 before fn_1995
# would swap the labels of those two columns.
names(df1)[names(df1) == 'hlm_share'] <- 'hlm_share_99'
names(df1)[names(df1) == 'res_own_share'] <- 'res_owned_share_99'

# Keep the final columns in the intended order; this also drops imm and pop.
df1 <- df1[, c('CODGEO', 'libcom', 'sru_period', 'imm_share_90', 'hlm_share_99',
               'res_owned_share_99', 'running_pre', 'imm_share_99', 'fn_1995')]


# Post-treatment variables: per-CODGEO mean over all rows with year >= 2006.
# NOTE(review): the original comment said "2006 to 2015", but the filter has no
# upper bound, so any later year present in the data is included -- confirm.
df3 <- data[which(data$year >= 2006),
            c('CODGEO', 'num_hlm', 'hlm_share_res', 'hlm_share_log',
              'res_own_share', 'imm', 'num_imm', 'fn')]

# na.rm (previously misspelled as rm.na and therefore silently ignored) is
# forwarded to mean(). Note that the formula method of aggregate() applies
# na.action (na.omit by default), which already drops every row with an NA in
# any selected column before averaging.
df3 <- aggregate(. ~ CODGEO, data = df3, FUN = mean, na.rm = TRUE)

# fn values at four points in time, one column per year, keyed by CODGEO.
# NOTE(review): the column labelled fn_02 is read from the 1999 rows --
# presumably the 2002 result is stored there; confirm against the dataset.
df4a <- setNames(data[data$year %in% 1999, c('CODGEO', 'fn')], c('CODGEO', 'fn_02'))
df4b <- setNames(data[data$year %in% 2007, c('CODGEO', 'fn')], c('CODGEO', 'fn_07'))
df4c <- setNames(data[data$year %in% 2012, c('CODGEO', 'fn')], c('CODGEO', 'fn_12'))
df4d <- setNames(data[data$year %in% 2017, c('CODGEO', 'fn')], c('CODGEO', 'fn_17'))

# Inner-join the four pieces on CODGEO, left to right.
df4 <- Reduce(
  function(left, right) merge(left, right, by = 'CODGEO'),
  list(df4a, df4b, df4c, df4d)
)


# Post-treatment variables on competition for public housing: per-CODGEO mean
# over all rows with year >= 2006.
# NOTE(review): the original comment said "2006 to 2015", but the filter has no
# upper bound -- confirm.
df5 <- data[which(data$year >= 2006),
            c('CODGEO',
              'hlm_active_dem', 'hlm_active_dem_fr', 'hlm_active_dem_exc', 'hlm_active_dem_eu',
              'hlm_closed_dem', 'hlm_closed_dem_fr', 'hlm_closed_dem_exc', 'hlm_closed_dem_eu')]

# na.rm (previously misspelled as rm.na and therefore silently ignored) is
# forwarded to mean(). The formula method of aggregate() applies na.action
# (na.omit by default), which already drops every row with an NA in any
# selected column before averaging.
df5 <- aggregate(. ~ CODGEO, data = df5, FUN = mean, na.rm = TRUE)


# Merge all pieces together with left joins onto df1, so every row of df1 is
# kept. The key is passed via `by`: merge() has no `on` argument (it would be
# swallowed by `...`), and without `by` the join silently uses every column
# name the two tables share.
prova <- df1
prova <- merge(prova, df2, by = 'CODGEO', all.x = TRUE)
prova <- merge(prova, df4, by = 'CODGEO', all.x = TRUE)
prova <- merge(prova, df3, by = 'CODGEO', all.x = TRUE)
prova <- merge(prova, df5, by = 'CODGEO', all.x = TRUE)

########################################
# Subset
########################################

# Build the estimation sample. A row of prova is kept only when ALL of the
# conditions below evaluate to TRUE (an NA condition drops the row).
# The masks live inside local() so they do not linger in the workspace.
df_did <- local({
  # Urban agglomerations with at least one treated municipality
  in_sru_agglo <- prova$agglo_sru == 1

  # Untreated municipalities (sru_period missing), or sru_period equal to 0
  untreated_or_first <- is.na(prova$sru_period) | prova$sru_period == 0

  # Municipalities within the bandwidth
  within_bdw <- abs(prova$running_pre) < bdw

  # None of the three share variables is missing
  shares_present <- !(is.na(prova$imm_share_99) |
                        is.na(prova$hlm_share_99) |
                        is.na(prova$hlm_share_log))

  prova[which(in_sru_agglo & untreated_or_first & within_bdw & shares_present), ]
})


# Remove the intermediate objects, leaving df_did.
rm(list = c('prova', 'df1', 'df2', 'df3', 'df4', 'df5', 'data',
            'df4a', 'df4b', 'df4c', 'df4d'))

